This is the next step after creating the required dataset for the project: exploring the data to find anomalies, outliers, and missing (NA) values, and preparing it for modeling.

# Restore the objects stored in the .rda file into the current environment;
# the code below expects one of them to be named `dataset`.
load(file = "validation_test.rda")
# Preview the first ten rows.
head(x = dataset, n = 10L)
##    amb_temp      lat       lon population  LST Day_population    imprev
## 2      -3.3 41.85814 -87.61606       2515  3.9           3009 0.5169038
## 3      -7.2 41.81034 -87.59023       1654  1.1           5496 0.9253708
## 6      -1.1 41.85218 -87.67583       1436  2.2           2275 0.3241619
## 7       0.0 41.73631 -87.62418       3971  2.8           4864 0.2578571
## 8       0.0 41.76832 -87.68340       2878  5.0           4866 0.2573717
## 9       0.0 41.85780 -87.68581       3920  3.3           6008 0.4224022
## 10     -7.2 41.72246 -87.57535       3985  1.7           2490 0.8199707
## 11      5.6 41.73649 -87.61453       3400 12.2           1007 0.5233647
## 12     -3.3 41.96509 -87.67908       2898  4.4           1505 0.7719977
## 14     -6.7 41.83258 -87.64613       2251  0.6           1105 0.4883683
##    Land_Cover
## 2          10
## 3           3
## 6           6
## 7           7
## 8           3
## 9           9
## 10          4
## 11         10
## 12          2
## 14          2
# Per-column overview of `dataset` (printed below).
summary(dataset)
##     amb_temp            lat             lon           population  
##  Min.   :-11.100   Min.   :41.69   Min.   :-87.76   Min.   : 786  
##  1st Qu.: -8.175   1st Qu.:41.79   1st Qu.:-87.68   1st Qu.:1700  
##  Median : -5.850   Median :41.88   Median :-87.66   Median :2544  
##  Mean   : -4.679   Mean   :41.85   Mean   :-87.66   Mean   :2956  
##  3rd Qu.: -1.108   3rd Qu.:41.91   3rd Qu.:-87.63   3rd Qu.:3982  
##  Max.   :  5.600   Max.   :41.97   Max.   :-87.54   Max.   :7868  
##                                                                   
##       LST         Day_population     imprev         Land_Cover
##  Min.   :-5.600   Min.   : 676   Min.   :0.1071   10     :12  
##  1st Qu.:-0.600   1st Qu.:1887   1st Qu.:0.3879   9      :10  
##  Median : 1.400   Median :2786   Median :0.5412   4      : 8  
##  Mean   : 1.158   Mean   :3103   Mean   :0.5421   11     : 7  
##  3rd Qu.: 3.300   3rd Qu.:4144   3rd Qu.:0.7291   6      : 7  
##  Max.   :12.200   Max.   :7796   Max.   :0.9254   7      : 7  
##                                                   (Other):23

Let’s look at boxplots to compare similar variables:

library(ggplot2)
library(plotly)
library(tidyverse)

# Tidy the data for visualization: reshape the first seven columns into long
# format. lat and lon stay as identifier columns; every other column becomes
# one (parameter, value) pair per row.
# pivot_longer() replaces the superseded gather().
data.tidy <- dataset[, 1:7] %>%
  pivot_longer(
    cols = -c(lat, lon),
    names_to = "parameter",
    values_to = "value"
  )

# One boxplot panel per parameter. scales = "free" frees both axes: each panel
# gets its own y range and its x axis shows only that panel's parameter
# (with "free_y" every panel reserved x positions for all parameters).
bxp.tmeps <- ggplot(data.tidy, aes(x = parameter, y = value)) +
  geom_boxplot() +
  facet_wrap(~parameter, scales = "free")

# Convert the static ggplot into an interactive plotly widget and display it.
bxp.tmeps <- ggplotly(bxp.tmeps)
bxp.tmeps

Next, let’s look at the histograms of the data:

# Histogram of each parameter, one panel per parameter with its own x axis.
# The interactive plotly version is stored in `h` and then displayed.
hist.temps <- data.tidy %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(facets = ~parameter, scales = "free_x")
h <- ggplotly(p = hist.temps)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
h

Now let’s take a look at the correlations between the variables:

library(corrplot)
# Drop the coordinates by name (rather than by position) so the selection
# still holds if the column order of `dataset` changes.
corr_cols <- setdiff(names(dataset), c("lat", "lon"))
data_to_corr <- dataset[, corr_cols, drop = FALSE]
# Land_Cover is a factor: as.numeric() on a factor returns the internal level
# codes, not the labels, so go through character to recover the class numbers.
# NOTE(review): land-cover classes look nominal, so their correlation with
# the other variables should be interpreted with caution.
data_to_corr$Land_Cover <- as.numeric(as.character(data_to_corr$Land_Cover))
# Pairwise correlation matrix of the remaining columns, drawn as circles.
correlations <- cor(data_to_corr)
corrplot(correlations, method = "circle")

Next, we should consider feature selection and possibly remove the most highly correlated attributes.